import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
# Silence every warning category for the rest of the session.
warnings.filterwarnings(action="ignore")

# Lift pandas' truncation limits so wide and long frames are shown in full.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)
#pd.set_option('precision',3)
# Load the per-inverter generation readings of plant 1.
generation_data = pd.read_csv("Plant_1_Generation_Data.csv")

# Preview five random rows with an olive/white theme.
# NOTE(review): 'darkblack' does not look like a standard CSS colour name, so
# the border colour is probably ignored by the renderer - confirm.
_generation_preview_css = {
    'background-color': 'OliveDrab',
    'color': 'white',
    'border-color': 'darkblack',
}
generation_data.sample(5).style.set_properties(**_generation_preview_css)
| DATE_TIME | PLANT_ID | SOURCE_KEY | DC_POWER | AC_POWER | DAILY_YIELD | TOTAL_YIELD | |
|---|---|---|---|---|---|---|---|
| 10508 | 20-05-2020 04:30 | 4135001 | adLQvlD726eNBSB | 0.000000 | 0.000000 | 0.000000 | 6304513.000000 |
| 54847 | 11-06-2020 09:00 | 4135001 | uHbuxQJl8lW7ozc | 3864.250000 | 379.737500 | 502.625000 | 7241010.625000 |
| 65902 | 16-06-2020 14:45 | 4135001 | WRmjgnKYAwPKWDb | 7727.125000 | 755.975000 | 5198.250000 | 7266513.250000 |
| 8416 | 19-05-2020 03:15 | 4135001 | wCURE6d3bPkepu2 | 0.000000 | 0.000000 | 0.000000 | 6808393.000000 |
| 28319 | 29-05-2020 16:30 | 4135001 | ih0vzX44oOqAx2f | 4015.714286 | 394.571429 | 7615.857143 | 6294785.857000 |
# Load the weather-sensor readings of plant 1.
weather_data = pd.read_csv("Plant_1_Weather_Sensor_Data.csv")

# Preview five random rows with a pink/black theme.
# NOTE(review): 'darkblack' does not look like a standard CSS colour name, so
# the border colour is probably ignored by the renderer - confirm.
_weather_preview_css = {
    'background-color': 'pink',
    'color': 'Black',
    'border-color': 'darkblack',
}
weather_data.sample(5).style.set_properties(**_weather_preview_css)
| DATE_TIME | PLANT_ID | SOURCE_KEY | AMBIENT_TEMPERATURE | MODULE_TEMPERATURE | IRRADIATION | |
|---|---|---|---|---|---|---|
| 3141 | 2020-06-17 13:45:00 | 4135001 | HmiyD2TTLFNqkNe | 29.057731 | 45.051337 | 0.599461 |
| 704 | 2020-05-22 22:30:00 | 4135001 | HmiyD2TTLFNqkNe | 23.547205 | 22.648830 | 0.000000 |
| 3068 | 2020-06-16 19:30:00 | 4135001 | HmiyD2TTLFNqkNe | 24.442285 | 23.068772 | 0.000000 |
| 3149 | 2020-06-17 15:45:00 | 4135001 | HmiyD2TTLFNqkNe | 23.556775 | 23.536192 | 0.085784 |
| 890 | 2020-05-24 22:30:00 | 4135001 | HmiyD2TTLFNqkNe | 24.650606 | 23.198922 | 0.000000 |
def parse_datetime(date_str, formats=None):
    """Parse a timestamp string by trying explicit formats in order.

    Parameters
    ----------
    date_str : str
        Text to parse, e.g. ``'20-05-2020 04:30'`` or ``'2020-06-17 13:45:00'``.
    formats : iterable of str, optional
        ``strptime``-style formats to try, first match wins.  When omitted the
        four layouts below are used (year-first and day-first, each with and
        without seconds).

    Returns
    -------
    pandas.Timestamp
        Result of ``pd.to_datetime`` for the first format that matches.

    Raises
    ------
    ValueError
        If none of the formats matches ``date_str``.
    """
    if formats is None:
        formats = (
            '%Y-%m-%d %H:%M',
            '%Y-%m-%d %H:%M:%S',
            '%d-%m-%Y %H:%M',
            '%d-%m-%Y %H:%M:%S',
        )
    for fmt in formats:
        try:
            return pd.to_datetime(date_str, format=fmt)
        except ValueError:
            # This layout does not match; move on to the next candidate.
            continue
    raise ValueError(f"No valid date format found for {date_str}")
# Parse both timestamp columns.  The two files use different text layouts
# (see the previews), so every value goes through parse_datetime.
generation_data['DATE_TIME'] = generation_data['DATE_TIME'].apply(parse_datetime)
weather_data['DATE_TIME'] = weather_data['DATE_TIME'].apply(parse_datetime)

# Generation timestamps are kept at minute resolution.  Flooring does this
# directly instead of formatting to text and parsing the text back again; the
# weather timestamps keep their seconds, so they need no further conversion.
generation_data['DATE_TIME'] = generation_data['DATE_TIME'].dt.floor('min')

# Attach the plant-level weather reading to every inverter row that shares
# its timestamp.  The plant id and the weather sensor id are dropped before
# the join so that only the inverter's SOURCE_KEY remains.
df_solar = pd.merge(
    generation_data.drop(columns=['PLANT_ID']),
    weather_data.drop(columns=['PLANT_ID', 'SOURCE_KEY']),
    on='DATE_TIME',
    how='inner',
)
df_solar.sample(5).style.background_gradient(cmap='cool')
| DATE_TIME | SOURCE_KEY | DC_POWER | AC_POWER | DAILY_YIELD | TOTAL_YIELD | AMBIENT_TEMPERATURE | MODULE_TEMPERATURE | IRRADIATION | |
|---|---|---|---|---|---|---|---|---|---|
| 52827 | 2020-06-10 10:15:00 | 7JYdWkrLSPkdwr4 | 6629.625000 | 649.375000 | 1699.625000 | 7796428.625000 | 27.128528 | 38.266939 | 0.480099 |
| 18586 | 2020-05-24 16:30:00 | WRmjgnKYAwPKWDb | 3600.500000 | 353.000000 | 7716.625000 | 7100742.625000 | 32.829294 | 40.124707 | 0.242545 |
| 46364 | 2020-06-07 08:45:00 | ZnxXDlPa8U1GXgE | 5111.000000 | 501.225000 | 585.125000 | 6693639.125000 | 23.404236 | 31.958346 | 0.341317 |
| 32196 | 2020-05-31 13:30:00 | 3PZuoBAID5Wc2HD | 5162.625000 | 505.587500 | 4335.875000 | 7113823.875000 | 30.101738 | 48.065961 | 0.411193 |
| 29324 | 2020-05-30 04:45:00 | iCRJl6heRkivqQ3 | 0.000000 | 0.000000 | 0.000000 | 7291253.000000 | 21.293420 | 20.270673 | 0.000000 |
# Derive calendar and clock features from the merged timestamp.
# The column is converted once and the .dt accessor reused for every feature.
_stamp = pd.to_datetime(df_solar["DATE_TIME"]).dt

df_solar["DATE"] = _stamp.date
df_solar["TIME"] = _stamp.time
df_solar['DAY'] = _stamp.day
df_solar['MONTH'] = _stamp.month
# Series.dt.week no longer exists in pandas 2.x; isocalendar().week is its
# replacement.  It yields UInt32, so cast back to a plain integer column.
df_solar['WEEK'] = _stamp.isocalendar().week.astype('int64')

# Hour / minute of day for the ML models, read straight from the timestamp
# rather than by re-parsing the TIME objects.
df_solar['HOURS'] = _stamp.hour
df_solar['MINUTES'] = _stamp.minute
# Minutes elapsed since midnight (computed while HOURS is still numeric).
df_solar['TOTAL MINUTES PASS'] = df_solar['MINUTES'] + df_solar['HOURS'] * 60

# String versions used for filtering and as categorical plot axes.
df_solar["DATE_STRING"] = df_solar["DATE"].astype(str)
df_solar["HOURS"] = df_solar["HOURS"].astype(str)
df_solar["TIME"] = df_solar["TIME"].astype(str)
df_solar.head(2)
| DATE_TIME | SOURCE_KEY | DC_POWER | AC_POWER | DAILY_YIELD | TOTAL_YIELD | AMBIENT_TEMPERATURE | MODULE_TEMPERATURE | IRRADIATION | DATE | TIME | DAY | MONTH | WEEK | HOURS | MINUTES | TOTAL MINUTES PASS | DATE_STRING | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2020-05-15 | 1BY6WEcLGh8j5v7 | 0.0 | 0.0 | 0.0 | 6259559.0 | 25.184316 | 22.857507 | 0.0 | 2020-05-15 | 00:00:00 | 15 | 5 | 20 | 0 | 0 | 0 | 2020-05-15 |
| 1 | 2020-05-15 | 1IF53ai7Xc0U56Y | 0.0 | 0.0 | 0.0 | 6183645.0 | 25.184316 | 22.857507 | 0.0 | 2020-05-15 | 00:00:00 | 15 | 5 | 20 | 0 | 0 | 0 | 2020-05-15 |
# Print column dtypes, non-null counts and memory usage of the merged frame.
df_solar.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 68774 entries, 0 to 68773 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 DATE_TIME 68774 non-null datetime64[ns] 1 SOURCE_KEY 68774 non-null object 2 DC_POWER 68774 non-null float64 3 AC_POWER 68774 non-null float64 4 DAILY_YIELD 68774 non-null float64 5 TOTAL_YIELD 68774 non-null float64 6 AMBIENT_TEMPERATURE 68774 non-null float64 7 MODULE_TEMPERATURE 68774 non-null float64 8 IRRADIATION 68774 non-null float64 9 DATE 68774 non-null object 10 TIME 68774 non-null object 11 DAY 68774 non-null int64 12 MONTH 68774 non-null int64 13 WEEK 68774 non-null int64 14 HOURS 68774 non-null object 15 MINUTES 68774 non-null int64 16 TOTAL MINUTES PASS 68774 non-null int64 17 DATE_STRING 68774 non-null object dtypes: datetime64[ns](1), float64(7), int64(5), object(5) memory usage: 10.0+ MB
# Count missing values per column.
df_solar.isnull().sum()
DATE_TIME 0 SOURCE_KEY 0 DC_POWER 0 AC_POWER 0 DAILY_YIELD 0 TOTAL_YIELD 0 AMBIENT_TEMPERATURE 0 MODULE_TEMPERATURE 0 IRRADIATION 0 DATE 0 TIME 0 DAY 0 MONTH 0 WEEK 0 HOURS 0 MINUTES 0 TOTAL MINUTES PASS 0 DATE_STRING 0 dtype: int64
# (rows, columns) of the merged frame.
df_solar.shape
(68774, 18)
# There are no nulls in the dataset: every per-column null count is 0.
# Summary statistics of the numeric columns, shown with a rainbow gradient.
df_solar.describe().style.background_gradient(cmap='rainbow')
| DC_POWER | AC_POWER | DAILY_YIELD | TOTAL_YIELD | AMBIENT_TEMPERATURE | MODULE_TEMPERATURE | IRRADIATION | DAY | MONTH | WEEK | MINUTES | TOTAL MINUTES PASS | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 68774.000000 | 68774.000000 | 68774.000000 | 68774.000000 | 68774.000000 | 68774.000000 | 68774.000000 | 68774.000000 | 68774.000000 | 68774.000000 | 68774.000000 | 68774.000000 |
| mean | 3147.177450 | 307.778375 | 3295.834644 | 6978727.511362 | 25.558521 | 31.244997 | 0.232305 | 15.762876 | 5.518539 | 22.549481 | 22.490621 | 716.515107 |
| std | 4036.441826 | 394.394865 | 3145.220597 | 416270.720885 | 3.361300 | 12.308283 | 0.301948 | 8.554460 | 0.499660 | 1.461138 | 16.772385 | 412.069969 |
| min | 0.000000 | 0.000000 | 0.000000 | 6183645.000000 | 20.398505 | 18.140415 | 0.000000 | 1.000000 | 5.000000 | 20.000000 | 0.000000 | 0.000000 |
| 25% | 0.000000 | 0.000000 | 0.000000 | 6512006.826000 | 22.724491 | 21.123944 | 0.000000 | 9.000000 | 5.000000 | 21.000000 | 0.000000 | 360.000000 |
| 50% | 428.571429 | 41.450000 | 2658.473214 | 7146685.000000 | 24.670178 | 24.818984 | 0.031620 | 16.000000 | 6.000000 | 23.000000 | 15.000000 | 720.000000 |
| 75% | 6365.468750 | 623.561161 | 6274.000000 | 7268751.397000 | 27.960429 | 41.693659 | 0.454880 | 23.000000 | 6.000000 | 24.000000 | 30.000000 | 1065.000000 |
| max | 14471.125000 | 1410.950000 | 9163.000000 | 7846821.000000 | 35.252486 | 65.545714 | 1.221652 | 31.000000 | 6.000000 | 25.000000 | 45.000000 | 1425.000000 |
from sklearn.preprocessing import LabelEncoder

# Map every inverter id (SOURCE_KEY) to an integer code.
# fit_transform returns one code per row, in row order, so the result can be
# assigned as a column in a single step - no chunked copy is needed.
encoder = LabelEncoder()
encoded_values = encoder.fit_transform(df_solar['SOURCE_KEY'])
df_solar['SOURCE_KEY_NUMBER'] = encoded_values

print(df_solar.head())
DATE_TIME SOURCE_KEY DC_POWER AC_POWER DAILY_YIELD TOTAL_YIELD \ 0 2020-05-15 1BY6WEcLGh8j5v7 0.0 0.0 0.0 6259559.0 1 2020-05-15 1IF53ai7Xc0U56Y 0.0 0.0 0.0 6183645.0 2 2020-05-15 3PZuoBAID5Wc2HD 0.0 0.0 0.0 6987759.0 3 2020-05-15 7JYdWkrLSPkdwr4 0.0 0.0 0.0 7602960.0 4 2020-05-15 McdE0feGgRqW7Ca 0.0 0.0 0.0 7158964.0 AMBIENT_TEMPERATURE MODULE_TEMPERATURE IRRADIATION DATE TIME \ 0 25.184316 22.857507 0.0 2020-05-15 00:00:00 1 25.184316 22.857507 0.0 2020-05-15 00:00:00 2 25.184316 22.857507 0.0 2020-05-15 00:00:00 3 25.184316 22.857507 0.0 2020-05-15 00:00:00 4 25.184316 22.857507 0.0 2020-05-15 00:00:00 DAY MONTH WEEK HOURS MINUTES TOTAL MINUTES PASS DATE_STRING \ 0 15 5 20 0 0 0 2020-05-15 1 15 5 20 0 0 0 2020-05-15 2 15 5 20 0 0 0 2020-05-15 3 15 5 20 0 0 0 2020-05-15 4 15 5 20 0 0 0 2020-05-15 SOURCE_KEY_NUMBER 0 0 1 1 2 2 3 3 4 4
# Persist the engineered frame (without the row index) for later reuse.
file_name = 'df_solar.csv'
df_solar.to_csv(path_or_buf=file_name, index=False)

# Number of distinct calendar days covered by the data (34 in this data set).
df_solar['DATE'].nunique(dropna=True)
34
# Mean DC power per time of day (rows) and calendar day (columns).
solar_dc = df_solar.pivot_table(values='DC_POWER', index='TIME', columns='DATE', aggfunc='mean')


def Daywise_plot_dc(data=None, row=None, col=None, title='DC Power'):
    """Draw one small red line chart per column of ``data`` on one figure.

    Parameters
    ----------
    data : pandas.DataFrame
        One column per panel; each column is plotted against the index.
    row, col : int, optional
        Shape of the subplot grid.  When ``col`` is omitted 3 columns are
        used; when ``row`` is omitted the smallest row count that fits every
        panel is used.
    title : str
        Prefix of each panel title; the column label is appended.
    """
    cols = data.columns
    if col is None:
        col = 3
    if row is None:
        # Ceiling division: enough rows for every column of ``data``.
        row = max(1, -(-len(cols) // col))

    gp = plt.figure(figsize=(20, 40))
    gp.subplots_adjust(wspace=0.2, hspace=0.5)
    for position, day in enumerate(cols, start=1):
        ax = gp.add_subplot(row, col, position)
        data[day].plot(ax=ax, color='red')
        ax.set_title('{} {}'.format(title, day), color='blue')


Daywise_plot_dc(data=solar_dc, row=12, col=3)
# Mean AC power per time of day (rows) and calendar day (columns).
solar_ac = df_solar.pivot_table(values='AC_POWER', index='TIME', columns='DATE', aggfunc='mean')


def Daywise_plot_ac(data=None, row=None, col=None, title='AC Power'):
    """Draw one small red line chart per column of ``data`` on one figure.

    Parameters
    ----------
    data : pandas.DataFrame
        One column per panel; each column is plotted against the index.
    row, col : int, optional
        Shape of the subplot grid.  When ``col`` is omitted 3 columns are
        used; when ``row`` is omitted the smallest row count that fits every
        panel is used.
    title : str
        Prefix of each panel title; the column label is appended.
    """
    cols = data.columns
    if col is None:
        col = 3
    if row is None:
        # Ceiling division: enough rows for every column of ``data``.
        row = max(1, -(-len(cols) // col))

    gp = plt.figure(figsize=(20, 40))
    gp.subplots_adjust(wspace=0.2, hspace=0.5)
    for position, day in enumerate(cols, start=1):
        ax = gp.add_subplot(row, col, position)
        data[day].plot(ax=ax, color='red')
        ax.set_title('{} {}'.format(title, day), color='blue')


Daywise_plot_ac(data=solar_ac, row=12, col=3)
def Daywise_plot_dc_ac(dc_data=None, ac_data=None, row=None, col=None, title='Power'):
    """Overlay DC (red) and AC (green) curves, one panel per column.

    Parameters
    ----------
    dc_data, ac_data : pandas.DataFrame
        Frames with one column per panel.  The columns of ``dc_data`` drive
        the panels, so ``ac_data`` must contain the same column labels.
    row, col : int, optional
        Shape of the subplot grid.  When ``col`` is omitted 3 columns are
        used; when ``row`` is omitted the smallest row count that fits every
        panel is used.
    title : str
        Prefix of each panel title; the column label is appended.
    """
    cols = dc_data.columns
    if col is None:
        col = 3
    if row is None:
        # Ceiling division: enough rows for every column of ``dc_data``.
        row = max(1, -(-len(cols) // col))

    gp = plt.figure(figsize=(20, 40))
    gp.subplots_adjust(wspace=0.2, hspace=0.5)
    for position, day in enumerate(cols, start=1):
        ax = gp.add_subplot(row, col, position)
        dc_data[day].plot(ax=ax, color='red', label='DC Power')
        ac_data[day].plot(ax=ax, color='green', label='AC Power')
        ax.set_title('{} {}'.format(title, day), color='blue')
        ax.legend()


Daywise_plot_dc_ac(dc_data=solar_dc, ac_data=solar_ac, row=12, col=3)
# Sum of all DC readings per calendar day, drawn largest-first.
daily_dc = df_solar.groupby('DATE')['DC_POWER'].sum()
ax = daily_dc.sort_values(ascending=False).plot.bar(figsize=(17, 5), legend=True, color='red')
plt.title('Daily DC Power')
plt.show()

# Same view for the AC readings.
daily_ac = df_solar.groupby('DATE')['AC_POWER'].sum()
ax = daily_ac.sort_values(ascending=False).plot.bar(figsize=(17, 5), legend=True, color='red')
plt.title('Daily AC Power')
plt.show()

# Put both daily totals side by side and draw them as grouped bars,
# ordered by the DC total.
daily_power = pd.concat({'DC_POWER': daily_dc, 'AC_POWER': daily_ac}, axis=1)
_power_by_dc = daily_power.sort_values(by='DC_POWER', ascending=False)
ax = _power_by_dc.plot.bar(figsize=(17, 5), color=['red', 'green'])
plt.title('Daily DC and AC Power')
plt.xlabel('Date')
plt.ylabel('Power')
plt.legend(['DC Power', 'AC Power'])
plt.show()
# Mean irradiation per time of day (rows) and calendar day (columns).
solar_irradiation = df_solar.pivot_table(values='IRRADIATION', index='TIME', columns='DATE', aggfunc='mean')


def Daywise_plot(data=None, row=None, col=None, title='IRRADIATION'):
    """Draw one small blue line chart per column of ``data`` on one figure.

    Parameters
    ----------
    data : pandas.DataFrame
        One column per panel; each column is plotted against the index.
    row, col : int, optional
        Shape of the subplot grid.  When ``col`` is omitted 3 columns are
        used; when ``row`` is omitted the smallest row count that fits every
        panel is used.
    title : str
        Prefix of each panel title; the column label is appended.
    """
    cols = data.columns
    if col is None:
        col = 3
    if row is None:
        # Ceiling division: enough rows for every column of ``data``.
        row = max(1, -(-len(cols) // col))

    gp = plt.figure(figsize=(20, 40))
    gp.subplots_adjust(wspace=0.2, hspace=0.5)
    for position, day in enumerate(cols, start=1):
        ax = gp.add_subplot(row, col, position)
        data[day].plot(ax=ax, color='blue')
        ax.set_title('{} {}'.format(title, day), color='blue')


Daywise_plot(data=solar_irradiation, row=12, col=3)
def Daywise_plot_dc_irradiation(dc_data=None, irrad_data=None, row=None, col=None, title='DC Vs Irradiation'):
    """Overlay DC power (red) and irradiation (green), one panel per column.

    Both curves share one y-axis, so they are drawn in their raw units.

    Parameters
    ----------
    dc_data, irrad_data : pandas.DataFrame
        Frames with one column per panel.  The columns of ``dc_data`` drive
        the panels, so ``irrad_data`` must contain the same column labels.
    row, col : int, optional
        Shape of the subplot grid.  When ``col`` is omitted 3 columns are
        used; when ``row`` is omitted the smallest row count that fits every
        panel is used.
    title : str
        Prefix of each panel title; the column label is appended.
    """
    cols = dc_data.columns
    if col is None:
        col = 3
    if row is None:
        # Ceiling division: enough rows for every column of ``dc_data``.
        row = max(1, -(-len(cols) // col))

    gp = plt.figure(figsize=(20, 40))
    gp.subplots_adjust(wspace=0.2, hspace=0.5)
    for position, day in enumerate(cols, start=1):
        ax = gp.add_subplot(row, col, position)
        dc_data[day].plot(ax=ax, color='red', label='DC Power')
        irrad_data[day].plot(ax=ax, color='green', label='Irradiation')
        ax.set_title('{} {}'.format(title, day), color='blue')
        ax.legend()


Daywise_plot_dc_irradiation(dc_data=solar_dc, irrad_data=solar_irradiation, row=12, col=3)
# Sum of all irradiation readings per calendar day, drawn largest-first.
daily_irradiation = df_solar.groupby('DATE')['IRRADIATION'].agg('sum')
daily_irradiation.sort_values(ascending=False).plot.bar(figsize=(17, 5), legend=True, color='blue')
plt.title('IRRADIATION')
plt.show()

# Combine the daily DC total and the daily irradiation total.
daily_dcpower_irrad = pd.DataFrame({'DC_POWER': daily_dc, 'IRRADIATION': daily_irradiation})

# Grouped bars of the combined frame.  This must plot daily_dcpower_irrad:
# plotting daily_power here would draw DC vs AC under an irradiation label.
# NOTE(review): DC_POWER values are several orders of magnitude larger than
# IRRADIATION values (see the describe() table), so on a shared axis the
# irradiation bars will be barely visible - consider a secondary y-axis.
ax = daily_dcpower_irrad.sort_values(by='DC_POWER', ascending=False).plot.bar(
    figsize=(17, 5), color=['red', 'green']
)
plt.title('Daily DC Power and Irradiation')
plt.xlabel('Date')
plt.ylabel('Power Vs Irradiation')
plt.legend(['DC Power', 'Irradiation'])
plt.show()
#sns.displot(data=df_solar, x="AMBIENT_TEMPERATURE", kde=True, bins = 100,color = "red", facecolor = "#3F7F7F",height = 5, aspect = 3.5);
# Mean ambient / module temperature per time of day (rows) and day (columns).
solar_ambiant_temp = df_solar.pivot_table(values='AMBIENT_TEMPERATURE', index='TIME', columns='DATE', aggfunc='mean')
solar_module_temp = df_solar.pivot_table(values='MODULE_TEMPERATURE', index='TIME', columns='DATE', aggfunc='mean')


def Daywise_plot_am_mo(am_data=None, mo_data=None, row=None, col=None, title='Ambient Vs Module'):
    """Overlay ambient (red) and module (green) temperature, one panel per column.

    Parameters
    ----------
    am_data, mo_data : pandas.DataFrame
        Frames with one column per panel.  The columns of ``am_data`` drive
        the panels, so ``mo_data`` must contain the same column labels.
    row, col : int, optional
        Shape of the subplot grid.  When ``col`` is omitted 3 columns are
        used; when ``row`` is omitted the smallest row count that fits every
        panel is used.
    title : str
        Prefix of each panel title; the column label is appended.
    """
    cols = am_data.columns
    if col is None:
        col = 3
    if row is None:
        # Ceiling division: enough rows for every column of ``am_data``.
        row = max(1, -(-len(cols) // col))

    gp = plt.figure(figsize=(20, 40))
    gp.subplots_adjust(wspace=0.2, hspace=0.5)
    for position, day in enumerate(cols, start=1):
        ax = gp.add_subplot(row, col, position)
        am_data[day].plot(ax=ax, color='red', label='Ambient Temperature')
        mo_data[day].plot(ax=ax, color='green', label='Module Temperature')
        ax.set_title('{} {}'.format(title, day), color='blue')
        ax.legend()


Daywise_plot_am_mo(am_data=solar_ambiant_temp, mo_data=solar_module_temp, row=12, col=3)
# Per-day totals of the two temperature readings.
# NOTE(review): a daily *sum* of temperatures grows with the number of
# readings in the day; a mean may be what was intended - confirm.
_by_day = df_solar.groupby('DATE')
daily_am = _by_day['AMBIENT_TEMPERATURE'].sum()
daily_mo = _by_day['MODULE_TEMPERATURE'].sum()

daily_ambient_module_temp = pd.concat(
    {'AMBIENT_TEMPERATURE': daily_am, 'MODULE_TEMPERATURE': daily_mo},
    axis=1,
)

# Grouped bars, ordered by the module-temperature total.
_temp_by_module = daily_ambient_module_temp.sort_values(by='MODULE_TEMPERATURE', ascending=False)
ax = _temp_by_module.plot.bar(figsize=(17, 5), color=['red', 'green'])
plt.title('Daily Ambient and Module Temperature')
plt.xlabel('Date')
plt.ylabel('Ambient Vs Module')
plt.legend(['Ambient', 'Module'])
plt.show()
# Four stacked panels for a single day: DC power, AC power, irradiation and
# the two temperatures.  The series labels call this the "best" day.
date = ["2020-05-25"]

# Select the day's rows once instead of re-filtering for every series.
best_day = df_solar[df_solar["DATE_STRING"].isin(date)]

plt.figure(figsize=(16, 16))

plt.subplot(411)
sns.lineplot(x=best_day.DATE_TIME, y=best_day.DC_POWER,
             label="DC_Power_Best", color='green')
# Same panel title as the matching single-day figure for 2020-05-18.
plt.title("DC Power Generation: {}".format(date[0]))

plt.subplot(412)
sns.lineplot(x=best_day.DATE_TIME, y=best_day.AC_POWER,
             label="AC_Power_Best", color='green')

plt.subplot(413)
sns.lineplot(x=best_day.DATE_TIME, y=best_day.IRRADIATION,
             label="Irridation_Best", color='green')
plt.title("Irradiation : {}".format(date[0]))

# Ambient and module temperature share the last panel.
plt.subplot(414)
sns.lineplot(x=best_day.DATE_TIME, y=best_day.AMBIENT_TEMPERATURE,
             label="Ambient_Temperature_Best", color='green')
sns.lineplot(x=best_day.DATE_TIME, y=best_day.MODULE_TEMPERATURE,
             label="Module_Temperature_Best", color='blue')
plt.title("Module Temperature & Ambient Temperature: {}".format(date[0]))

plt.tight_layout()
plt.show()
# Four stacked panels for a single day: DC power, AC power, irradiation and
# the two temperatures.  The series labels call this the "worst" day.
date = ["2020-05-18"]
worst_day = df_solar[df_solar["DATE_STRING"].isin(date)]
_stamps = worst_day.DATE_TIME

plt.figure(figsize=(16, 16))

plt.subplot(411)
sns.lineplot(x=_stamps, y=worst_day.DC_POWER, label="DC_Power_Worst", color='red')
plt.title("DC Power Generation: {}".format(date[0]))

plt.subplot(412)
sns.lineplot(x=_stamps, y=worst_day.AC_POWER, label="AC_Power_Worst", color='red')

plt.subplot(413)
sns.lineplot(x=_stamps, y=worst_day.IRRADIATION, label="Irridation_Worst", color='red')
plt.title("Irradiation : {}".format(date[0]))

# Ambient and module temperature share the last panel.
plt.subplot(414)
for _column, _label, _colour in (
    ('AMBIENT_TEMPERATURE', "Ambient_Temperature_Worst", 'red'),
    ('MODULE_TEMPERATURE', "Module_Temperature_Worst", 'blue'),
):
    sns.lineplot(x=_stamps, y=worst_day[_column], label=_label, color=_colour)
plt.title("Module Temperature & Ambient Temperature: {}".format(date[0]))

plt.tight_layout()
plt.show()
# List the distinct inverter ids present in the merged data.
df_solar['SOURCE_KEY'].unique()
array(['1BY6WEcLGh8j5v7', '1IF53ai7Xc0U56Y', '3PZuoBAID5Wc2HD',
'7JYdWkrLSPkdwr4', 'McdE0feGgRqW7Ca', 'VHMLBKoKgIrUVDU',
'WRmjgnKYAwPKWDb', 'ZnxXDlPa8U1GXgE', 'ZoEaEvLYb1n2sOq',
'adLQvlD726eNBSB', 'bvBOhCH3iADSZry', 'iCRJl6heRkivqQ3',
'ih0vzX44oOqAx2f', 'pkci93gMrogZuBj', 'rGa61gmuvPhdLxV',
'sjndEbLyjtCKgGv', 'uHbuxQJl8lW7ozc', 'wCURE6d3bPkepu2',
'z9Y9gH1T5YWrNuG', 'zBIq5rxdHJRwDNY', 'zVJPv84UY57bAof',
'YxYtjZvoooNbGkE'], dtype=object)
# Compare all inverters at one instant: keep only the rows of that timestamp.
filter_date = '2020-06-06 08:30:00'
filtered_df = df_solar.loc[df_solar['DATE_TIME'] == filter_date]

plt.figure(figsize=(14, 8))

# One line per power column, inverter id on the x-axis.
for _column, _colour, _legend in (
    ('DC_POWER', 'g', 'DC Power Input'),
    ('AC_POWER', 'r', 'AC Power Output'),
):
    plt.plot(filtered_df['SOURCE_KEY'], filtered_df[_column],
             marker='o', linestyle='-', color=_colour, label=_legend)

plt.title(f'Power vs Inverter ID on {filter_date}')
plt.xlabel('Inverter ID')
plt.ylabel('Power (kW)')

# Inverter ids are long strings, so print them vertically.
plt.xticks(rotation=90)
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
# Strictly positive DC and AC readings as plain arrays.
solar_dc_power = df_solar.loc[df_solar['DC_POWER'] > 0, 'DC_POWER'].values
solar_ac_power = df_solar.loc[df_solar['AC_POWER'] > 0, 'AC_POWER'].values

# Ratio of the largest AC reading to the largest DC reading, in percent.
# NOTE(review): the sample rows also show AC at roughly a tenth of DC, which
# suggests the two columns may not share a unit - confirm before reading
# this figure as an efficiency.
_peak_ratio = solar_ac_power.max() / solar_dc_power.max()
solar_plant_eff = _peak_ratio * 100
print(f"Power ratio AC/DC (Efficiency) of Solar Power Plant: {solar_plant_eff:0.3f} %")
Power ratio AC/DC (Efficiency) of Solar Power Plant: 9.750 %
# Every strictly positive AC reading, in row order.
AC_list = [reading for reading in df_solar['AC_POWER'] if reading > 0]
len(AC_list)
36823
# Every strictly positive DC reading, ordered from largest to smallest.
DC_list = sorted(
    (reading for reading in df_solar['DC_POWER'] if reading > 0),
    reverse=True,
)
len(DC_list)
36823
# Efficiency must be computed per reading: AC and DC values are only
# comparable when they come from the same row.  Sorting the two columns
# independently and pairing them by position would match readings from
# different inverters and times, and would break as soon as the two filtered
# lists differ in length.
producing = df_solar[(df_solar['DC_POWER'] > 0) & (df_solar['AC_POWER'] > 0)]
producing = producing.sort_values('AC_POWER')
eff = producing['AC_POWER'] / producing['DC_POWER']

plt.figure(figsize=(16, 8))
# Points rather than a connected line: neighbouring x values belong to
# unrelated readings, so joining them would only draw noise.
plt.plot(producing['AC_POWER'], eff, linestyle='none', marker='.', markersize=2, color='green')
plt.xlabel('Output power in kW')
plt.ylabel('efficiency AC/DC')
plt.title('Output power vs efficiency');
# Work on a copy so the modelling steps cannot alter df_solar.
df2 = df_solar.copy()

# Model inputs and the target to predict.
# NOTE(review): DC_POWER is one of the inputs and AC_POWER looks like a
# near-constant multiple of it in the sample rows, which would make the
# regression almost trivial - confirm this is intended.
_feature_columns = [
    'DAILY_YIELD',
    'TOTAL_YIELD',
    'AMBIENT_TEMPERATURE',
    'MODULE_TEMPERATURE',
    'IRRADIATION',
    'DC_POWER',
]
X = df2[_feature_columns]
y = df2['AC_POWER']
X.head()
| DAILY_YIELD | TOTAL_YIELD | AMBIENT_TEMPERATURE | MODULE_TEMPERATURE | IRRADIATION | DC_POWER | |
|---|---|---|---|---|---|---|
| 0 | 0.0 | 6259559.0 | 25.184316 | 22.857507 | 0.0 | 0.0 |
| 1 | 0.0 | 6183645.0 | 25.184316 | 22.857507 | 0.0 | 0.0 |
| 2 | 0.0 | 6987759.0 | 25.184316 | 22.857507 | 0.0 | 0.0 |
| 3 | 0.0 | 7602960.0 | 25.184316 | 22.857507 | 0.0 | 0.0 |
| 4 | 0.0 | 7158964.0 | 25.184316 | 22.857507 | 0.0 | 0.0 |
# Preview the first target values.
y.head()
0 0.0 1 0.0 2 0.0 3 0.0 4 0.0 Name: AC_POWER, dtype: float64
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# Hold out 20 % of the rows for evaluation; the seed keeps the split fixed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=21
)

# Fit a linear baseline and report its test-set score as a percentage.
lr_clf = LinearRegression().fit(X_train, y_train)
score_lr = 100 * lr_clf.score(X_test, y_test)
print(f'LR Model score = {score_lr:4.4f}%')
LR Model score = 99.9995%
# Linear regression: fit, predict on the held-out rows, report R^2 in percent.
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# r2_score expects (y_true, y_pred); the metric is not symmetric, so the
# ground truth has to come first.
R2_Score_lr = round(r2_score(y_test, y_pred_lr) * 100, 2)
print("R2 Score : ", R2_Score_lr, "%")
R2 Score : 100.0 %
from sklearn.ensemble import RandomForestRegressor

# Random forest: fit, predict on the held-out rows, report R^2 in percent.
# The forest is seeded (same seed as the train/test split) so that repeated
# runs give the same model and score.
rfr = RandomForestRegressor(random_state=21)
rfr.fit(X_train, y_train)
y_pred_rfr = rfr.predict(X_test)

# r2_score expects (y_true, y_pred); the metric is not symmetric, so the
# ground truth has to come first.
R2_Score_rfr = round(r2_score(y_test, y_pred_rfr) * 100, 2)
print("R2 Score : ", R2_Score_rfr, "%")
R2 Score : 100.0 %
from sklearn.tree import DecisionTreeRegressor

# Decision tree: fit, predict on the held-out rows, report R^2 in percent.
# Seeded (same seed as the train/test split) for repeatable results.
dtr = DecisionTreeRegressor(random_state=21)
dtr.fit(X_train, y_train)

# Predict with the tree that was just fitted; using another model's
# predictions here would report that model's score under this one's name.
y_pred_dtr = dtr.predict(X_test)

# r2_score expects (y_true, y_pred); the metric is not symmetric, so the
# ground truth has to come first.
R2_Score_dtr = round(r2_score(y_test, y_pred_dtr) * 100, 2)
print("R2 Score : ", R2_Score_dtr, "%")
R2 Score : 100.0 %
# Predict AC power for the held-out rows with the fitted random forest.
prediction = rfr.predict(X_test)
print(prediction)
[ 0. 1071.24303578 299.62135714 ... 669.40498213 377.82001786 117.4835 ]
# Put the true target next to the model output, keeping y_test's row labels.
cross_checking = y_test.to_frame(name='Actual')
cross_checking['Predicted'] = prediction
cross_checking.head()
| Actual | Predicted | |
|---|---|---|
| 43819 | 0.0000 | 0.000000 |
| 2949 | 1072.3250 | 1071.243036 |
| 33769 | 299.8125 | 299.621357 |
| 47825 | 0.0000 | 0.000000 |
| 29370 | 0.0000 | 0.000000 |
# Signed residual: positive when the model under-predicts.
cross_checking['Error'] = cross_checking['Actual'].sub(cross_checking['Predicted'])
cross_checking.head()
| Actual | Predicted | Error | |
|---|---|---|---|
| 43819 | 0.0000 | 0.000000 | 0.000000 |
| 2949 | 1072.3250 | 1071.243036 | 1.081964 |
| 33769 | 299.8125 | 299.621357 | 0.191143 |
| 47825 | 0.0000 | 0.000000 | 0.000000 |
| 29370 | 0.0000 | 0.000000 | 0.000000 |
# Keep the rows whose prediction is within 20 units of the truth.
# Error is signed, so the magnitude is compared: without abs() an arbitrarily
# large over-prediction (negative error) would pass the filter.
cross_checking_final = cross_checking[cross_checking['Error'].abs() <= 20]

# Show up to 25 random rows; the cap avoids an error when fewer rows remain.
_preview_rows = min(25, len(cross_checking_final))
cross_checking_final.sample(_preview_rows).style.background_gradient(
    cmap='coolwarm').set_properties(**{
        'font-family': 'Lucida Calligraphy',
        # 'LightGreen' is the CSS colour name; the misspelt 'LigntGreen' is
        # not a colour and would be ignored.
        'color': 'LightGreen',
        'font-size': '15px',
    })
| Actual | Predicted | Error | |
|---|---|---|---|
| 66617 | 0.000000 | 0.000000 | 0.000000 |
| 19323 | 0.000000 | 0.000000 | 0.000000 |
| 22096 | 817.942857 | 817.714625 | 0.228232 |
| 51487 | 0.000000 | 0.000000 | 0.000000 |
| 28800 | 0.000000 | 0.000000 | 0.000000 |
| 37613 | 0.000000 | 0.000000 | 0.000000 |
| 64553 | 0.000000 | 0.000000 | 0.000000 |
| 68601 | 0.000000 | 0.000000 | 0.000000 |
| 812 | 556.675000 | 556.546560 | 0.128440 |
| 41193 | 0.000000 | 0.000000 | 0.000000 |
| 51656 | 0.000000 | 0.000000 | 0.000000 |
| 40767 | 842.975000 | 842.777032 | 0.197968 |
| 15312 | 0.000000 | 0.000000 | 0.000000 |
| 68029 | 90.300000 | 90.308000 | -0.008000 |
| 20974 | 0.000000 | 0.000000 | 0.000000 |
| 22593 | 498.537500 | 498.462643 | 0.074857 |
| 54320 | 0.000000 | 0.000000 | 0.000000 |
| 55268 | 666.642857 | 666.609250 | 0.033607 |
| 15992 | 838.875000 | 839.413482 | -0.538482 |
| 26192 | 618.342857 | 618.319089 | 0.023768 |
| 21612 | 0.000000 | 0.000000 | 0.000000 |
| 5465 | 86.214286 | 86.242732 | -0.028446 |
| 37832 | 0.000000 | 0.000000 | 0.000000 |
| 36256 | 628.237500 | 628.244768 | -0.007268 |
| 40239 | 673.285714 | 673.324286 | -0.038571 |
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input

# Standardize the features.  The scaler is fitted on the training rows only
# and then applied unchanged to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build the MLP: two ReLU hidden layers and one linear output unit.
# The input width is declared with an explicit Input layer, which is the
# supported way to specify it; passing input_dim to Dense is deprecated in
# current Keras.
mlp_model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1),
])

# Compile the model for regression (mean squared error, Adam optimizer).
mlp_model.compile(loss='mse', optimizer='adam')

# Train the model.
mlp_model.fit(X_train_scaled, y_train, epochs=50, batch_size=10, verbose=1)

# Make predictions on the held-out rows.
y_pred_mlp = mlp_model.predict(X_test_scaled)

# Evaluate the model (ground truth first, prediction second).
mse_mlp = mean_squared_error(y_test, y_pred_mlp)
r2_mlp = r2_score(y_test, y_pred_mlp)
print(f'MLP Model - MSE: {mse_mlp}, R2 Score: {r2_mlp}')
Epoch 1/50 5502/5502 ━━━━━━━━━━━━━━━━━━━━ 2s 277us/step - loss: 49272.4258 Epoch 2/50 5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 264us/step - loss: 17.8594 Epoch 3/50 5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 264us/step - loss: 1.5824 Epoch 4/50 5502/5502 ━━━━━━━━━━━━━━━━━━━━ 2s 306us/step - loss: 1.3549 Epoch 5/50 5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 266us/step - loss: 1.9260 Epoch 6/50 5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 264us/step - loss: 0.8639 Epoch 7/50 5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 264us/step - loss: 1.1559 Epoch 8/50 5502/5502 ━━━━━━━━━━━━━━━━━━━━ 2s 275us/step - loss: 1.3298 Epoch 9/50 5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 265us/step - loss: 1.7006 Epoch 10/50 5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 265us/step - loss: 1.3854 Epoch 11/50 5502/5502 ━━━━━━━━━━━━━━━━━━━━ 2s 275us/step - loss: 1.3155 Epoch 12/50 5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 265us/step - loss: 0.9738 Epoch 13/50 5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 266us/step - loss: 0.7780 Epoch 14/50 5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 265us/step - loss: 0.9318 Epoch 15/50 5502/5502 ━━━━━━━━━━━━━━━━━━━━ 2s 276us/step - loss: 0.9338 Epoch 16/50 5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 265us/step - loss: 0.8152 Epoch 17/50 5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 265us/step - loss: 0.6181 Epoch 18/50 5502/5502 ━━━━━━━━━━━━━━━━━━━━ 2s 276us/step - loss: 1.6709 Epoch 19/50 5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 267us/step - loss: 0.9457 Epoch 20/50 5502/5502 ━━━━━━━━━━━━━━━━━━━━ 2s 294us/step - loss: 1.6886 Epoch 21/50 5502/5502 ━━━━━━━━━━━━━━━━━━━━ 2s 276us/step - loss: 1.5974 Epoch 22/50 5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 266us/step - loss: 0.6508 Epoch 23/50 5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 266us/step - loss: 1.4363 Epoch 24/50 5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 267us/step - loss: 1.3786 Epoch 25/50 5502/5502 ━━━━━━━━━━━━━━━━━━━━ 2s 279us/step - loss: 0.9376 Epoch 26/50 5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 268us/step - loss: 0.8532 Epoch 27/50 5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 269us/step - loss: 0.9270 Epoch 28/50 5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 268us/step - 
loss: 0.7021 Epoch 29/50 5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 269us/step - loss: 0.8422 Epoch 30/50 5502/5502 ━━━━━━━━━━━━━━━━━━━━ 2s 281us/step - loss: 1.5426 Epoch 31/50 5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 269us/step - loss: 0.9313 Epoch 32/50 5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 268us/step - loss: 0.8556 Epoch 33/50 5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 270us/step - loss: 1.1805 Epoch 34/50 5502/5502 ━━━━━━━━━━━━━━━━━━━━ 2s 280us/step - loss: 0.7001 Epoch 35/50 5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 271us/step - loss: 0.7493 Epoch 36/50 5502/5502 ━━━━━━━━━━━━━━━━━━━━ 2s 299us/step - loss: 1.5413 Epoch 37/50 5502/5502 ━━━━━━━━━━━━━━━━━━━━ 2s 288us/step - loss: 0.7257 Epoch 38/50 5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 269us/step - loss: 0.6937 Epoch 39/50 5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 269us/step - loss: 0.8396 Epoch 40/50 5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 271us/step - loss: 0.7010 Epoch 41/50 5502/5502 ━━━━━━━━━━━━━━━━━━━━ 2s 282us/step - loss: 0.7524 Epoch 42/50 5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 270us/step - loss: 0.9560 Epoch 43/50 5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 269us/step - loss: 0.5954 Epoch 44/50 5502/5502 ━━━━━━━━━━━━━━━━━━━━ 2s 280us/step - loss: 0.8679 Epoch 45/50 5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 268us/step - loss: 1.2432 Epoch 46/50 5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 270us/step - loss: 1.5490 Epoch 47/50 5502/5502 ━━━━━━━━━━━━━━━━━━━━ 2s 281us/step - loss: 0.8518 Epoch 48/50 5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 269us/step - loss: 1.0041 Epoch 49/50 5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 270us/step - loss: 1.1406 Epoch 50/50 5502/5502 ━━━━━━━━━━━━━━━━━━━━ 2s 281us/step - loss: 0.4706 430/430 ━━━━━━━━━━━━━━━━━━━━ 0s 237us/step MLP Model - MSE: 0.17525856311964316, R2 Score: 0.999998865808189